library(edgeR)
library(ggplot2)
library(readr)
library(here)
# Create output directories if they don't exist
dir.create(file.path("edgeR"), showWarnings = FALSE)
dir.create(file.path("edgeR_transcript"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("edgeR_gene"), showWarnings = FALSE, recursive = TRUE)
Transcripts
Load count data - Transcripts
counts <- read.csv("/Volumes/sheynkman/projects/LRP_Mohi_project/03_filter_sqanti/MDS_filtered_raw_counts.tsv",
header = TRUE, row.names = 1)
group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))
dge_raw <- DGEList(counts = counts, group = group)
Visualize raw counts
boxplot(dge_raw$counts, main = "Boxplot of Raw Counts", las = 2, col = as.numeric(group))
plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Counts")
plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)
MD (mean-difference) plots per sample
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
abline(h = 0, col = "red", lty = 2, lwd = 2)
}
par(mfrow = c(1, 1)) # Reset plotting area to single panel
Filter lowly expressed transcripts (required for modeling)
keep <- filterByExpr(dge_raw)
table(keep)
## keep
## FALSE TRUE
## 292011 37723
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]
Normalize Counts with TMM
dge <- calcNormFactors(dge, method = "TMM")
Save counts matrices
# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_transcript", "dte_raw.rda"))
save(dge, file = file.path("edgeR_transcript", "dte_norm.rda"))
Visualize normalized counts
boxplot(dge$counts, main = "Boxplot of Normalized Counts")
log_cpm_tx <- cpm(dge, log = TRUE, prior.count = 1)
boxplot(log_cpm_tx,
main = "Boxplot of log2(CPM + 1) - Transcripts",
las = 2,
col = as.numeric(group))
, las = 2, col = as.numeric(group)) plotMDS(dge, col =
as.numeric(group), main = “MDS Plot for Normalized Counts”) plotMD(dge,
col = as.numeric(group), main = “MD Plot for Normalized Counts”)
abline(h = 0, col = “red”, lty = 2, lwd = 2)
Design matrix and dispersion
``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)
dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.07809944
plotBCV(dge_disp)
Fit model and test
fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)
result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient: groupWT
## logFC logCPM F PValue FDR
## PB.21599.2 2.592444 4.875611 121.33363 4.501609e-07 0.01590197
## PB.9626.41 2.657393 3.640055 106.58645 8.430913e-07 0.01590197
## PB.17614.66 5.520253 1.697234 61.15331 2.148668e-06 0.01611877
## PB.20180.8 2.300030 4.451426 84.60091 2.509844e-06 0.01611877
## PB.16163.1 2.290359 5.648489 82.23354 2.864402e-06 0.01611877
## PB.11107.2 2.308405 4.354890 81.53789 2.982938e-06 0.01611877
## PB.11890.1 2.275614 4.935824 79.41331 3.371832e-06 0.01611877
## PB.2313.1 2.059830 7.696301 74.94752 4.409125e-06 0.01611877
## PB.24052.20 8.827273 2.260153 107.02258 4.458135e-06 0.01611877
## PB.3791.1 2.301812 3.958942 73.81950 4.737358e-06 0.01611877
FDR and DEG Summary
FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 104
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
## BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.21599.2 7.893988 7.636693 9.13440 42.179837 59.923100
## PB.9626.41 3.633741 3.369129 2.89256 20.906528 19.708042
## PB.17614.66 0.000000 0.000000 0.30448 4.768155 5.459660
## PB.20180.8 6.390371 7.187475 8.22096 28.058761 39.016596
## PB.16163.1 17.542196 11.679647 21.16136 69.321645 95.477472
## PB.11107.2 7.016878 4.941389 8.06872 29.342495 36.885997
## PB.11890.1 11.026523 8.085910 11.72248 39.795759 48.337967
## PB.2313.1 92.221831 62.665801 85.10216 234.006400 348.619278
## PB.24052.20 0.000000 0.000000 0.00000 6.051890 8.389234
## PB.3791.1 4.134946 4.492172 6.69856 24.024168 24.501890
## BioSample_6
## PB.21599.2 47.087966
## PB.9626.41 22.262678
## PB.17614.66 6.887015
## PB.20180.8 40.200951
## PB.16163.1 83.445002
## PB.11107.2 34.274914
## PB.11890.1 62.463629
## PB.2313.1 419.467292
## PB.24052.20 12.172399
## PB.3791.1 27.387899
summary(decideTests(qlf))
## groupWT
## Down 7
## NotSig 37619
## Up 97
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")
Save DEG results & intermediate files
deg_results <- topTags(result, n = Inf)$table
write.table(deg_results,
file = file.path("edgeR_transcript", "transcript_DEG_results.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
# Raw and normalized CPM Matrices
write.table(cpm(dge_raw),
file = file.path("edgeR_transcript", "raw_CPM_matrix.txt"),
sep = "\t", quote = FALSE)
write.table(cpm(dge),
file = file.path("edgeR_transcript", "normalized_CPM_matrix.txt"),
sep = "\t", quote = FALSE)
# List of filtered-in transcripts
write.table(rownames(dge),
file = file.path("edgeR_transcript", "filtered_transcripts.txt"),
quote = FALSE, row.names = FALSE, col.names = FALSE)
# Design matrix to keep record of the model used
write.table(design,
file = file.path("edgeR_transcript", "design_matrix.txt"),
sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)
# Top genes table
top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes,
file = file.path("edgeR_transcript", "top_transcripts.txt"),
sep = "\t", quote = FALSE, row.names = TRUE)
# Save dispersion plots (BCV and QL)
png(file.path("edgeR_transcript", "BCV_plot.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen
## 2
png(file.path("edgeR_transcript", "QLDisp_plot.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen
## 2
# MDS plot (normalized)
png(file.path("edgeR_transcript", "MDS_plot.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot for Normalized Counts")
dev.off()
## quartz_off_screen
## 2
Volcano Plot - left = downregulated in Q157R vs. WT (higher in WT); right = up-regulated in Q157R vs. WT (higher in Q157R)
deg_results$Gene <- rownames(deg_results)
ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) +
geom_point(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
ggsave(file.path("edgeR_transcript", "transcript_volcano_plot.png"),
width = 6, height = 6, dpi = 300)
Interactive volcano plot
library(ggiraph)
library(plotly)
gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
tooltip = Gene, data_id = Gene)) +
geom_point_interactive(aes(color = FDR < 0.05)) +
scale_color_manual(values = c("black", "red")) +
theme_minimal() +
labs(title = "Interactive Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")
girafe(ggobj = gg, width_svg = 10, height_svg = 6)